library(rhdf5)
library(dplyr)
library(tidyr)
library(caret)
library(corrplot)
library(rpart)
library(ggplot2)
The data source is supplied by Kaggle's "Two Sigma Financial Modeling" competition: https://www.kaggle.com/c/two-sigma-financial-modeling/downloads/train.h5.zip
# Prepare a clean workspace and fetch the competition data.
# NOTE(review): rm(list = ls()) and setwd() are discouraged in shared scripts;
# they are kept here only to preserve the original interactive workflow.
rm(list=ls())
setwd("~/Documents")
if (!file.exists("KaggleProject")) dir.create("KaggleProject")
fileUrl <- "https://www.kaggle.com/c/two-sigma-financial-modeling/downloads/train.h5.zip"
zipPath <- "./KaggleProject/train.h5.zip"
# Download only if not already present; mode = "wb" is required so the
# binary zip is not corrupted on Windows (default mode is text there)
if (!file.exists(zipPath)) {
  download.file(fileUrl, destfile = zipPath, mode = "wb")
}
unzip(zipPath, exdir = "./KaggleProject")
# Inspect the HDF5 layout. The archive was unzipped into ./KaggleProject,
# so the file must be read from there (the original read "train.h5" from
# the working directory, where it does not exist).
h5ls("./KaggleProject/train.h5")
## group name otype dclass dim
## 0 / train H5I_GROUP
## 1 /train axis0 H5I_DATASET STRING 111
## 2 /train axis1 H5I_DATASET INTEGER 1710756
## 3 /train block0_items H5I_DATASET STRING 2
## 4 /train block0_values H5I_DATASET INTEGER 2 x 1710756
## 5 /train block1_items H5I_DATASET STRING 109
## 6 /train block1_values H5I_DATASET FLOAT 109 x 1710756
From there we can observe that block1_items contains the name of each variable, while block1_values contains the numeric values we want.
# Read variable names and the value matrix from the unzipped HDF5 file
# (same path fix as above: the file lives in ./KaggleProject)
block1_items <- h5read("./KaggleProject/train.h5", name = "train/block1_items")
block1_values <- h5read("./KaggleProject/train.h5", name = "train/block1_values")
# Build the "Train" data frame: h5read already returns a matrix, so the
# as.matrix() step is unnecessary; transpose so rows = observations
Train <- t(block1_values)
colnames(Train) <- block1_items
Train <- as.data.frame(Train)
dim(Train)
## [1] 1710756 109
Since the raw dataset contains NA and NaN values, it is necessary to remove them before the real analysis.
# Remove every observation (row) that contains NA or NaN values.
# complete.cases() treats NaN as missing (like is.na()) and avoids the
# apply() anti-pattern, which coerces the whole data frame to a matrix.
good <- complete.cases(Train)
Train <- Train[good, ]
dim(Train)
## [1] 223040 109
Check the correlation between the variables. Remove the ones highly correlated with others, so as to save modeling time later.
# Visualize pairwise correlations on a random subset of rows
# png(filename = "FinancialCh.png", width = 4800,height = 4800,units = "px")
# NOTE(review): 50 rows is far fewer than the 109 variables, so these sample
# correlations are very noisy; consider a larger sample for a stable estimate.
rrow <- sample(nrow(Train), 50)
corGraph <- cor(Train[rrow, ])
corrplot(corGraph, order = "FPC", method = "number", type = "lower",
         tl.cex = 0.8, tl.col = rgb(0, 0, 0), number.cex = 0.7, number.digits = 2)
# dev.off()
According to the plot, all of the following variables are highly correlated with other existing variables, so we remove them (duplicates omitted): fundamental_1, fundamental_10, fundamental_11, fundamental_12, fundamental_15, fundamental_16, fundamental_26, fundamental_30, fundamental_34, fundamental_41, fundamental_42, fundamental_43, fundamental_45, fundamental_48, fundamental_51, fundamental_53, fundamental_55, fundamental_56, fundamental_57, fundamental_60, fundamental_61.
# Drop the variables judged redundant from the correlation plot.
# The original list contained six duplicated names (e.g. fundamental_26,
# fundamental_10); dropping each column once is sufficient, and a character
# vector with all_of() makes the selection explicit and typo-safe.
dropVars <- c(
  "fundamental_1",  "fundamental_10", "fundamental_11", "fundamental_12",
  "fundamental_15", "fundamental_16", "fundamental_26", "fundamental_30",
  "fundamental_34", "fundamental_41", "fundamental_42", "fundamental_43",
  "fundamental_45", "fundamental_48", "fundamental_51", "fundamental_53",
  "fundamental_55", "fundamental_56", "fundamental_57", "fundamental_60",
  "fundamental_61"
)
Train <- Train %>%
  select(-all_of(dropVars))
dim(Train)
## [1] 223040 88
set.seed(2017-01-10)
# 70/30 partition stratified on the response y
inTrain <- createDataPartition(Train$y, p = 0.7, list = FALSE)
trainSet <- Train[inTrain, ]
testSet <- Train[-inTrain, ]
# Baseline linear model on all remaining predictors
ini.fit <- lm(y ~ ., data = trainSet)
# Predict the y of testSet to compare with the real y of testSet.
# Use the generic predict() rather than calling the method predict.lm() directly.
pred_ini.fit <- predict(ini.fit, newdata = testSet)
# Signed R value: sign(R^2) * sqrt(|R^2|) keeps a negative R^2 visible
R_sq <- 1 - (sum((pred_ini.fit - testSet$y)^2)) / (sum((testSet$y - mean(testSet$y))^2))
R_val <- sign(R_sq) * sqrt(abs(R_sq))
R_val
## [1] 0.02471166
This R_val is not generalized: it depends on set.seed(). To get a more accurate estimate of the R value, we need to try 50 or more random seeds and average R_val. For now this is just a first glance at R_val; we will recalculate it once the other parameters have been settled.
Calculate the train_error and test_error for ini.fit, then plot these two errors against the number of observations used. This gives an idea of whether the model suffers from high bias or high variance.
set.seed(2017-01-10)
# Learning curve: refit the model on growing subsets of the training data
# and track train/test error at each size.
rw <- c(5000, 10000, 25000, 50000, 75000, 100000)
# Preallocate result vectors instead of growing them inside the loop
R_val <- numeric(length(rw))
J_train <- numeric(length(rw))
J_test <- numeric(length(rw))
m_cv <- nrow(testSet)  # hoist the loop-invariant test-set size
for (i in seq_along(rw)) {
  obs <- rw[i]
  A <- trainSet[1:obs, ]
  fit <- lm(y ~ ., data = A)
  # BUG FIX: the original predicted with ini.fit (the full-data model), so
  # the learning curve never depended on obs; predictions must come from
  # the subset model `fit`.
  pred_train <- predict(fit, newdata = A)
  pred_cv <- predict(fit, newdata = testSet)
  R_sq <- 1 - (sum((pred_cv - testSet$y)^2)) / (sum((testSet$y - mean(testSet$y))^2))
  R_val[i] <- sign(R_sq) * sqrt(abs(R_sq))
  J_train[i] <- 1 / (2 * obs) * sum((pred_train - A$y)^2)
  # BUG FIX: the test error was scaled by obs; it must be scaled by the
  # number of test observations (Ng-style J_cv = SSE / (2 * m_cv))
  J_test[i] <- 1 / (2 * m_cv) * sum((pred_cv - testSet$y)^2)
}
# Keep the per-size R values alongside the sample sizes (not used downstream,
# preserved for parity with the original script)
R_val <- data.frame(R_val = R_val, rw = rw)
# Assemble the two error curves and reshape to long format for ggplot.
# pivot_longer() supersedes gather() (tidyr >= 1.0).
J_ <- data.frame(J_train = J_train, J_test = J_test, rw = rw)
J_error <- J_ %>%
  pivot_longer(cols = -rw, names_to = "type", values_to = "errors")
g <- ggplot(data = J_error, aes(x = rw, y = errors, color = type)) +
  geom_point() +
  geom_line() +
  labs(x = "numbers of observations") +
  ggtitle("Errors versus the number of observations") +
  theme(plot.title = element_text(hjust = 0.5))
g
According to the train_error (green) and test_error (red) lines, both converge to a quite small value of about 0.0001. This means the model suffers from neither high bias nor high variance. Since the curves intersect at about 75,000 observations, in future tests we can use 75,000 observations for quick model filtering.
Not all of the variables are necessary for a good model. Redundant variables often add nothing to accuracy but slow down the computation.
set.seed(2017-01-10)
# Stepwise (both-direction) variable selection on a 75,000-row subsample —
# the size at which the learning curves converged above.
Set <- Train[sample(nrow(Train), 75000), ]
inifit <- lm(y ~ ., data = Set)
#do.call(anova,lm.ls)
bestfit <- step(inifit, direction = "both")
bestfit$call
## lm(formula = y ~ fundamental_5 + fundamental_7 + fundamental_14 +
## fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 +
## fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 +
## fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 +
## fundamental_63 + technical_0 + technical_7 + technical_11 +
## technical_16 + technical_20 + technical_21 + technical_22 +
## technical_27 + technical_30 + technical_32 + technical_36 +
## technical_37 + technical_44, data = Set)
# Refit the step-selected formula on the full training partition
fit_final <- lm(formula = y ~ fundamental_5 + fundamental_7 + fundamental_14 +
fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 +
fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 +
fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 +
fundamental_63 + technical_0 + technical_7 + technical_11 +
technical_16 + technical_20 + technical_21 + technical_22 +
technical_27 + technical_30 + technical_32 + technical_36 +
technical_37 + technical_44, data = trainSet)
pred.train <- predict(fit_final, newdata = trainSet)
pred.cv <- predict(fit_final, newdata = testSet)
# BUG FIX: the errors were previously scaled by the stale loop variable
# `obs` (left at 100000 by the earlier learning-curve loop); each error
# must be scaled by the size of its own data set.
J.train <- 1 / (2 * nrow(trainSet)) * sum((pred.train - trainSet$y)^2)
J.test <- 1 / (2 * nrow(testSet)) * sum((pred.cv - testSet$y)^2)
sprintf("The train_error is %.5f", J.train)
## [1] "The train_error is 0.00022"  (pre-fix value; will shift slightly)
sprintf("The test_error is %.5f", J.test)
## [1] "The test_error is 0.00009"   (pre-fix value; will shift slightly)
Both the train_error and test_error are close to the small value of 0.0001. This indicates that the model suffers from neither high bias nor high variance.
To get closer to the true value and reduce the random effect of the data-set partition, we repeat the partition 100 times to obtain 100 variants of train.Set and test.Set. The R value is then averaged to produce the final estimate.
# Modeling fit: average the out-of-sample R value over 100 random 70/30
# partitions to reduce dependence on any single split.
R.val <- rep(0, 100)
for (i in seq_len(100)) {
  set.seed(i)
  inTrain <- createDataPartition(Train$y, p = 0.7, list = FALSE)
  train.Set <- Train[inTrain, ]
  test.Set <- Train[-inTrain, ]
  fit_ <- lm(y ~ fundamental_5 + fundamental_7 + fundamental_14 +
    fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 +
    fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 +
    fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 +
    fundamental_63 + technical_0 + technical_7 + technical_11 +
    technical_16 + technical_20 + technical_21 + technical_22 +
    technical_27 + technical_30 + technical_32 + technical_36 +
    technical_37 + technical_44, data = train.Set, na.action = na.exclude)
  # Predict the y of testSet to compare with the real y of testSet
  pred.lm <- predict(fit_, newdata = test.Set)
  # Signed R value, as above
  R.sq <- 1 - (sum((pred.lm - test.Set$y)^2)) / (sum((test.Set$y - mean(test.Set$y))^2))
  R.val[i] <- sign(R.sq) * sqrt(abs(R.sq))
}
# BUG FIX: the original sapply() resampled each scalar R value individually;
# sample() on a length-1 numeric x draws from 1:x, which errors (or is
# meaningless) for values < 1. A bootstrap of the mean must resample the
# whole R.val vector.
df <- replicate(100, mean(sample(R.val, 10, replace = TRUE)))
df <- round(df, 4)
# BUG FIX: round() defaults to 0 digits, which collapsed every small mean
# to 0 (the original printed "## [1] 0"); keep 4 decimal places.
round(mean(df), 4)
hist(df, breaks = 10)